# import libraries
import pandas as pd
import plotly.express as px
from datetime import datetime, timedelta
import seaborn as sns
from matplotlib import pyplot as plt
import numpy as np
from scipy import stats as st
%matplotlib inline
import plotly.graph_objects as go
import math as mth
from plotly import graph_objects as go
import warnings
warnings.filterwarnings('ignore')
#function for the preliminary dataset analysis
def analyze(dataset):
    """Show a quick overview of *dataset*: dtypes, describe(), head and duplicate count.

    Side effect: column names are normalized to snake_case in place.
    """
    separator = '*' * 50
    # Normalize headers: lowercase, spaces -> underscores.
    dataset.columns = [col.lower().replace(' ', '_') for col in dataset.columns.values]
    dataset.info()
    display(separator)
    display(dataset.describe())
    display(separator)
    display(dataset.head())
    display(separator)
    display('Number of duplicate values', dataset.duplicated().sum())
# Load the raw event log (tab-separated) and run the preliminary overview.
df = pd.read_csv('/datasets/logs_exp.csv', sep='\t')
analyze(df)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 244126 entries, 0 to 244125 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 eventname 244126 non-null object 1 deviceidhash 244126 non-null int64 2 eventtimestamp 244126 non-null int64 3 expid 244126 non-null int64 dtypes: int64(3), object(1) memory usage: 7.5+ MB
'**************************************************'
| deviceidhash | eventtimestamp | expid | |
|---|---|---|---|
| count | 2.441260e+05 | 2.441260e+05 | 244126.000000 |
| mean | 4.627568e+18 | 1.564914e+09 | 247.022296 |
| std | 2.642425e+18 | 1.771343e+05 | 0.824434 |
| min | 6.888747e+15 | 1.564030e+09 | 246.000000 |
| 25% | 2.372212e+18 | 1.564757e+09 | 246.000000 |
| 50% | 4.623192e+18 | 1.564919e+09 | 247.000000 |
| 75% | 6.932517e+18 | 1.565075e+09 | 248.000000 |
| max | 9.222603e+18 | 1.565213e+09 | 248.000000 |
'**************************************************'
| eventname | deviceidhash | eventtimestamp | expid | |
|---|---|---|---|---|
| 0 | MainScreenAppear | 4575588528974610257 | 1564029816 | 246 |
| 1 | MainScreenAppear | 7416695313311560658 | 1564053102 | 246 |
| 2 | PaymentScreenSuccessful | 3518123091307005509 | 1564054127 | 248 |
| 3 | CartScreenAppear | 3518123091307005509 | 1564054127 | 248 |
| 4 | PaymentScreenSuccessful | 6217807653094995999 | 1564055322 | 248 |
'**************************************************'
'Number of duplicate values'
413
# Descriptive column names, exact-duplicate removal and derived datetime fields.
column_map = {
    'eventname': 'event_type',
    'deviceidhash': 'user_id',
    'eventtimestamp': 'event_datetime',
}
df = df.rename(columns=column_map).drop_duplicates()
# The raw timestamp is seconds since the Unix epoch.
df['event_datetime'] = pd.to_datetime(df['event_datetime'], unit='s')
df['date'] = df['event_datetime'].dt.date
df.head(10)
| event_type | user_id | event_datetime | expid | date | |
|---|---|---|---|---|---|
| 0 | MainScreenAppear | 4575588528974610257 | 2019-07-25 04:43:36 | 246 | 2019-07-25 |
| 1 | MainScreenAppear | 7416695313311560658 | 2019-07-25 11:11:42 | 246 | 2019-07-25 |
| 2 | PaymentScreenSuccessful | 3518123091307005509 | 2019-07-25 11:28:47 | 248 | 2019-07-25 |
| 3 | CartScreenAppear | 3518123091307005509 | 2019-07-25 11:28:47 | 248 | 2019-07-25 |
| 4 | PaymentScreenSuccessful | 6217807653094995999 | 2019-07-25 11:48:42 | 248 | 2019-07-25 |
| 5 | CartScreenAppear | 6217807653094995999 | 2019-07-25 11:48:43 | 248 | 2019-07-25 |
| 6 | OffersScreenAppear | 8351860793733343758 | 2019-07-25 14:50:42 | 246 | 2019-07-25 |
| 7 | MainScreenAppear | 5682100281902512875 | 2019-07-25 20:14:37 | 246 | 2019-07-25 |
| 8 | MainScreenAppear | 1850981295691852772 | 2019-07-25 20:31:42 | 247 | 2019-07-25 |
| 9 | MainScreenAppear | 5407636962369102641 | 2019-07-26 03:35:12 | 246 | 2019-07-26 |
We prepared the dataset for further analysis.
# Headline metrics: total events, unique users and events per user.
events = df.shape[0]
users = df['user_id'].nunique()
average_events_per_user = events / users
display('Total number of events', events)
display('Total number of users', users)
display('Average number of events per user', '{:.2f}'.format(average_events_per_user))
'Total number of events'
243713
'Total number of users'
7551
'Average number of events per user'
'32.28'
# The most recent event date present in the log.
print('The latest date', df['date'].max())
The latest date 2019-08-07
# The oldest event date present in the log.
print('The earliest date', df['date'].min())
The earliest date 2019-07-25
We have data for the period from July 25 to August 7, 2019.
# Distribution of events over the whole observation period.
ax = df['event_datetime'].hist(figsize=(15, 7), bins=200)
ax.set_title('Histogram by date and time')
ax.set_ylabel('Number of events')
plt.show()
We see a very small number of observations for the period from 07/25/2019 till 08/01/2019. We know that technically, for some users, events from the past can be recorded in the logs in the upcoming days, which skews the data. The histogram shows a noticeable increase in observations for the period 07/31/2019 till 08/01/2019. Let's look at the hourly distribution of events on July 31.
# Zoom in on July 31: intra-day distribution of events.
target_day = pd.to_datetime("2019-07-31").date()
one_day = df[df['date'] == target_day]
one_day['event_datetime'].hist(figsize=(15, 7), bins=500)
plt.title('Histogram by date and time')
plt.ylabel('Number of events')
plt.show()
We see a significant increase in the number of observations at 9 p.m. We can remove all the data before this time. As a result, we'll have the data beginning from 9 p.m. 07/31/2019 till 08/07/2019.
# Keep only events recorded after 21:00 on July 31.
cutoff = pd.to_datetime("2019-07-31 21:00:00")
data = df[df['event_datetime'] > cutoff]
# sanity check: earliest remaining timestamp
data['event_datetime'].min()
Timestamp('2019-07-31 21:00:57')
# How many events did the truncation discard, absolutely and as a share?
removed = len(df) - len(data)
print('Removed events', removed)
print('Share of removed events in raw data', '{:.2%}'.format(removed / len(df)))
Removed events 1989 Share of removed events in raw data 0.82%
With this truncation, we lost a negligible percentage of the original data. Let's check that we have users from all three experimental groups.
# Sanity check: all three experimental groups survived the truncation.
data['expid'].value_counts()
248 84875 246 79556 247 77293 Name: expid, dtype: int64
# Number of logged events per event type, most frequent first.
events_count = (
    data.groupby('event_type')
        .agg(user_id=('user_id', 'count'))
        .sort_values('user_id', ascending=False)
)
print(events_count)
user_id event_type MainScreenAppear 117889 OffersScreenAppear 46531 CartScreenAppear 42343 PaymentScreenSuccessful 33951 Tutorial 1010
There are five types of events in the logs: the rarest is viewing the tutorial, and the most frequent is the appearance of the main screen. In the Offers-Cart-Payment subsequence, the number of logged events decreases at each step.
# Unique users per event type and their share of all unique users.
total_users = data['user_id'].nunique()
funnel = (
    data.groupby('event_type')
        .agg({'user_id': 'nunique'})
        .sort_values('user_id', ascending=False)
        .reset_index()
)
funnel['percent'] = funnel['user_id'] / total_users
with pd.option_context('display.float_format', '{:.2%}'.format):
    display(funnel)
| event_type | user_id | percent | |
|---|---|---|---|
| 0 | MainScreenAppear | 7423 | 98.47% |
| 1 | OffersScreenAppear | 4597 | 60.98% |
| 2 | CartScreenAppear | 3736 | 49.56% |
| 3 | PaymentScreenSuccessful | 3540 | 46.96% |
| 4 | Tutorial | 843 | 11.18% |
Apparently, the events are arranged in the following order: MainScreenAppear → OffersScreenAppear → CartScreenAppear → PaymentScreenSuccessful.
It is unclear at what stage there is an appeal to the tutorial. We make an assumption this event can be ignored for further analysis.
# Drop the Tutorial event — it does not belong to the main funnel.
funnel = funnel[funnel['event_type'] != "Tutorial"]
# Step-to-step conversion: users at this step / users at the previous step.
funnel['conversion'] = funnel['user_id'] / funnel['user_id'].shift(1)
# The first funnel step has no predecessor, so its conversion is 100%.
funnel = funnel.fillna(1.00)
with pd.option_context('display.float_format', '{:.2%}'.format):
    display(funnel)
| event_type | user_id | percent | conversion | |
|---|---|---|---|---|
| 0 | MainScreenAppear | 7423 | 98.47% | 100.00% |
| 1 | OffersScreenAppear | 4597 | 60.98% | 61.93% |
| 2 | CartScreenAppear | 3736 | 49.56% | 81.27% |
| 3 | PaymentScreenSuccessful | 3540 | 46.96% | 94.75% |
The largest user loss occurs at the second step of the funnel (OffersScreenAppear).
# Overall funnel conversion: share of first-step users who reach payment
# (row 3 = PaymentScreenSuccessful, row 0 = MainScreenAppear, column 2 = percent).
percent = funnel.iloc[3, 2] / funnel.iloc[0, 2]
# Fix: the printed message previously misspelled "through" as "throug".
print("Share of users went through the full funnel:", '{:.2%}'.format(percent))
Share of users went through the full funnel: 47.69%
# Visualize the funnel as a plotly funnel chart.
funnel_trace = go.Funnel(
    y=funnel['event_type'],
    x=funnel['user_id'],
)
fig = go.Figure(funnel_trace)
fig.update_layout(title_text='Funnel chart')
fig.show()
# Unique users in every experimental group.
users_total = (
    data.groupby('expid')['user_id']
        .nunique()
        .to_frame('total_number_of_users')
)
users_total
| total_number_of_users | |
|---|---|
| expid | |
| 246 | 2484 |
| 247 | 2517 |
| 248 | 2537 |
The discrepancy in numbers between the groups is about 1%. There are 2 control groups for the A/A experiment - 246 and 247. Let's check if there is a difference between samples 246 and 247 (the hypothesis of equality of shares).
# Remind ourselves of the available columns before building the pivot.
data.columns
Index(['event_type', 'user_id', 'event_datetime', 'expid', 'date'], dtype='object')
Groups 246 and 247 together form the combined control group (246+247).
# Drop the Tutorial event from the working data as well.
data = data[data['event_type'] != "Tutorial"]
# Unique users per event for each group; 246+247 form the combined control.
summary = data.pivot_table(index='event_type', columns='expid', values='user_id', aggfunc='nunique')
summary['control'] = summary[[246, 247]].sum(axis=1)
summary
| expid | 246 | 247 | 248 | control |
|---|---|---|---|---|
| event_type | ||||
| CartScreenAppear | 1266 | 1239 | 1231 | 2505 |
| MainScreenAppear | 2450 | 2479 | 2494 | 4929 |
| OffersScreenAppear | 1542 | 1524 | 1531 | 3066 |
| PaymentScreenSuccessful | 1200 | 1158 | 1182 | 2358 |
The most popular event is MainScreenAppear. Let's count the number of users who committed this event in each of the control groups.
# Users who saw the main screen in each group, and their share of the group.
main_screen = data[data['event_type'] == "MainScreenAppear"]
MainScreenAppear_users = main_screen.groupby('expid').agg(number_of_users=('user_id', 'nunique'))
MainScreenAppear_users['share'] = (
    MainScreenAppear_users['number_of_users'] / users_total['total_number_of_users']
)
with pd.option_context('display.float_format', '{:.2%}'.format):
    display(MainScreenAppear_users)
| number_of_users | share | |
|---|---|---|
| expid | ||
| 246 | 2450 | 98.63% |
| 247 | 2479 | 98.49% |
| 248 | 2494 | 98.31% |
More than 98% of users take this action in each of the groups.
H0 - there are no differences between the shares, H1 - there are differences between the shares.
#function for conducting the test
def test(successes1, successes2, trials1, trials2, alpha):
    """Two-sided two-proportion z-test.

    Parameters
    ----------
    successes1, successes2 : number of "successes" (users who did the event).
    trials1, trials2 : group sizes.
    alpha : significance level for the printed decision.

    Prints the p-value and the decision; returns the p-value as a float
    (backward-compatible addition — callers that ignore it are unaffected).
    """
    p1 = successes1 / trials1
    p2 = successes2 / trials2
    # the proportion of success in the combined dataset (pooled under H0):
    p_combined = (successes1 + successes2) / (trials2 + trials1)
    # the difference in proportions in datasets
    difference = p1 - p2
    # z statistic in units of the standard normal distribution
    z_value = difference / mth.sqrt(p_combined * (1 - p_combined) * (1/trials1 + 1/trials2))
    # we set the standard normal distribution (mean 0, standard deviation 1)
    distr = st.norm(0, 1)
    # float() collapses a one-element Series/array to a scalar, so the p-value
    # prints as a plain number instead of "[0.675...]" when pandas objects
    # (e.g. DataFrame .loc slices) are passed in.
    p_value = float((1 - distr.cdf(abs(z_value))) * 2)
    print('p-value: ', p_value)
    if p_value < alpha:
        print('Reject the null hypothesis: there is a significant difference between the shares')
    else:
        print('It was not possible to reject the null hypothesis, there is no reason to consider the shares different')
    return p_value

# This helper is named "test", which test runners (pytest/nose) would try to
# collect and run; opt it out of collection explicitly.
test.__test__ = False
H0 - there are no differences between the shares, H1 - there are differences between the shares.
# A/A check: compare control groups 246 and 247 for every funnel event.
events = ['MainScreenAppear', 'CartScreenAppear', 'OffersScreenAppear', 'PaymentScreenSuccessful']
for event in events:
    print('Test for event:', event)
    # Fix: select the scalar count with .loc[group, column]; the previous
    # users_total.loc[246] passed a one-row Series, which made the p-value
    # print as a one-element array like "[0.675...]".
    test(summary.loc[event, 246], summary.loc[event, 247],
         users_total.loc[246, 'total_number_of_users'],
         users_total.loc[247, 'total_number_of_users'], 0.01)
    print(' '*50)
Test for event: MainScreenAppear
p-value: [0.67562177]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for event: CartScreenAppear
p-value: [0.21828121]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for event: OffersScreenAppear
p-value: [0.26698769]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for event: PaymentScreenSuccessful
p-value: [0.10298395]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
There is no statistically significant difference between the shares, so the division into groups was correct. Group 248 is the group with the changed font. We will conduct a similar test for it.
H0 - there are no differences between the shares, H1 - there are differences between the shares.
# A/B check: control group 246 vs test group 248 for every funnel event.
for event in events:
    print('Test for the event:', event)
    # Fix: pass scalar group sizes (.loc[group, column]) instead of one-row
    # Series, so the p-value prints as a plain number.
    test(summary.loc[event, 246], summary.loc[event, 248],
         users_total.loc[246, 'total_number_of_users'],
         users_total.loc[248, 'total_number_of_users'], 0.05)
    print(' '*50)
Test for the event: MainScreenAppear
p-value: [0.34705881]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for the event: CartScreenAppear
p-value: [0.08328413]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for the event: OffersScreenAppear
p-value: [0.20836205]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for the event: PaymentScreenSuccessful
p-value: [0.22269359]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
H0 - there are no differences between the shares, H1 - there are differences between the shares.
# A/B check: control group 247 vs test group 248 for every funnel event.
for event in events:
    print('Test for the event:', event)
    # Fix: pass scalar group sizes (.loc[group, column]) instead of one-row
    # Series, so the p-value prints as a plain number.
    test(summary.loc[event, 247], summary.loc[event, 248],
         users_total.loc[247, 'total_number_of_users'],
         users_total.loc[248, 'total_number_of_users'], 0.05)
    print(' '*50)
Test for the event: MainScreenAppear
p-value: [0.60016616]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for the event: CartScreenAppear
p-value: [0.61695175]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for the event: OffersScreenAppear
p-value: [0.88359567]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for the event: PaymentScreenSuccessful
p-value: [0.67754136]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Let's compare the results with the combined control group.
H0 - there are no differences between the shares, H1 - there are differences between the shares.
#calculate the total number of users in the control group
control_total = (users_total.loc[246, 'total_number_of_users']
                 + users_total.loc[247, 'total_number_of_users'])
# A/B check: test group 248 vs the combined control group for every event.
for event in events:
    print('Test for the event:', event)
    # Fix: pass the scalar size of group 248 (.loc[group, column]) instead of
    # a one-row Series, so the p-value prints as a plain number.
    test(summary.loc[event, 248], summary.loc[event, 'control'],
         users_total.loc[248, 'total_number_of_users'], control_total, 0.05)
    print(' '*50)
Test for the event: MainScreenAppear
p-value: [0.39298915]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for the event: CartScreenAppear
p-value: [0.19819341]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for the event: OffersScreenAppear
p-value: [0.41899828]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
Test for the event: PaymentScreenSuccessful
p-value: [0.64520577]
It was not possible to reject the null hypothesis, there is no reason to consider the shares different
We performed 16 statistical hypothesis tests, using a significance level of 1% for the control-group comparisons and 5% for the rest. Note that with 16 comparisons the chance of at least one false positive grows; since none of the tests were significant, this does not affect the conclusion.
The tests did not show a statistically significant difference between the groups - changing fonts in the application does not scare away users on their way through the funnel.
We lose the most users at the second step of the funnel - OffersScreenAppear, it's probably worth considering how to increase user interaction at this step.